鐵人賽
今日天氣不錯,就讓我們正式開始介紹 Sscofpmf 這個 extension 吧。
現有的 Privilege SPEC 已經定義了 mhpmevent
和 hpmcounter
用來選擇事件以及計算事件 counter,但沒有對這些 CSR 的欄位進行標準化。至少對於 rich-OS(例如 Linux)來說,需要對某些基本功能進行標準化。如此一來便能獲得標準的上游(upstream)支援,而無須各自維護客製化的軟體支援。
此 extension 名為 Sscofpmf (Ss表示 supervisor-level extension,cofpmf表示 Counter Overflow 以及 Privilege Mode Filtering)
請注意:count overflow interrupt 被視為標準 local interrupt ,它被分配到 mip/mie/sip/sie
的第 13 個 bit 當中。
此 extension 擴展了對 hardware performance monitor(以下簡稱 HPM)的描述,並將 mhpmevent
擴展到 64 bit,如下所示:
HPM 包含 29 個 64 bit event counter 以及 29 個 event selector,也就是 mhpmcounter3~mhpmcounter31
以及 mhpmevent3~mhpmevent31
。
mhpmevent
是 WARL 的暫存器,用於控制是哪個事件並在相對應的 counter(mhpmcounter
)做遞增,以及是否 overflow 的發生,當 mhpmevent =0
表示無事件,並且 counter也不會遞增。
在 RV32 上,access mcycle、minstret、mhpmcounter、mhpmevent
為低位元32 bit,而高位元則為 mcycleh、minstreth、mhpmcounterh、mhpmeventh
, mhpmeventh
的 CSR 編號為 0x723~0x73f
將以下添加到 mhpmevent
bit 當中:
mhpmcounter
或 mhpmevent
不會導致 count overflow,只有從硬體counter 往上增加才會造成 count overflow。counter overflow interrupt 被視為 local interrupt 對應於 mip/mie/sip/sie
的第13個 bit。 mip/sip
LCOFIP 和 mie/sie
LCOFIE 分別是 interrupt pending bit 以及 interrupt enable bit
這是為此 extension 所新增的 CSR
CSR number 0xda0
該 CSR 會複製 mhpmevent
的 OF bit並放在相對應的位置,該 CSR 可以快速知道有哪些 counter 發生 overflow
接著我們來看看在 kernel 當中我們該如何使用呢?
打開 make menuconfig,照著下面 location 的地方去找尋,將會發現有三個選項,事實上需要把第一個選項開啟,才會有下面兩個,第二個選項表示,過去的 perf 基本上,存在著前幾天所說的限制問題,未來預計會被移除掉,有興趣的可以複習前幾天的文章,第三個選項也就是我們 sscofpmf,同時把二 跟三選項啟用,會直接忽略掉第二個選項
講完要去哪裡啟用了,那來說說在 linux 為了此 extension 新增了什麼 patch 吧~
hpmcounter3~29/hpmcounterh3~29
CSR
說完新增的 patch 了,來介紹一下程式碼相關的內容吧。簡單說明一下:共用的 function 放在 riscv_pmu.c,不共用的部分則各自實作,legacy 放在 riscv_pmu_legacy.c,SBI PMU 放在 riscv_pmu_sbi.c,下方會以 SBI PMU 做介紹。
讓我們來舉個範例:
for example: $perf stat -e cache-misses ls
的一生
首先一開始 event 初始化
/*
 * Prepare the hardware-side state (event->hw) of a perf event.
 *
 * The PMU backend's event_map() callback translates the perf attributes
 * into an event index and an optional config value; both are stored in
 * event->hw. The counter index is deliberately left unassigned here.
 *
 * Returns 0 on success, or the negative value handed back by event_map()
 * when the event cannot be mapped.
 */
static int riscv_pmu_event_init(struct perf_event *event)
{
	/* riscv_pmu.c */
	struct hw_perf_event *hw = &event->hw;
	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
	u64 mapped_config = 0;
	uint64_t width_mask;
	int event_idx;

	hw->flags = 0;

	event_idx = pmu->event_map(event, &mapped_config);
	if (event_idx < 0) {
		pr_debug("event %x:%llx not supported\n", event->attr.type,
			 event->attr.config);
		return event_idx;
	}

	/*
	 * No counter is bound yet: idx stays at -1 until a counter is
	 * picked when the event is added to the PMU. config carries the
	 * value filled in by event_map(), event_base the mapped event.
	 */
	hw->config = mapped_config;
	hw->idx = -1;
	hw->event_base = event_idx;

	/* Sampling events keep whatever period was already configured. */
	if (is_sampling_event(event))
		return 0;

	/*
	 * Counting-only events: cap the period at half of the counter
	 * width so a new counter value is far less likely to overtake the
	 * previous one, barring serious IRQ latency.
	 */
	width_mask = riscv_pmu_ctr_get_width_mask(event);
	hw->sample_period = width_mask >> 1;
	hw->last_period = hw->sample_period;
	local64_set(&hw->period_left, hw->sample_period);

	return 0;
}
開始時會先到 riscv_pmu_event_init,取得一些必要的資訊,透過 rvpmu->event_map 取得 mapped_event 以及 event_config,這邊我們把 rvpmu->event_map 展開會到 pmu_sbi_event_map 這個 function
/*
 * Translate a perf event's (type, config) pair into the event index
 * used by the SBI PMU backend.
 *
 * @event:   perf event whose attr.type / attr.config are inspected
 * @econfig: written only for hardware raw events; receives the raw
 *           config value masked with RISCV_PMU_RAW_EVENT_MASK
 *
 * Returns the mapped event index, or -EINVAL for an unknown type, an
 * out-of-range hardware event, or an out-of-range firmware event.
 */
static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
{
	/* riscv_pmu_sbi.c */
	const u32 type = event->attr.type;
	const u64 config = event->attr.config;
	u64 raw_bits;

	if (type == PERF_TYPE_HARDWARE) {
		if (config >= PERF_COUNT_HW_MAX)
			return -EINVAL;
		return pmu_hw_event_map[config].event_idx;
	}

	if (type == PERF_TYPE_HW_CACHE)
		return pmu_event_find_cache(config);

	if (type != PERF_TYPE_RAW)
		return -EINVAL;

	/*
	 * Raw events: per the SBI specification the upper 16 bits must be
	 * unused, so the MSB (bit 63) is used to tell firmware events
	 * apart from hardware raw events.
	 */
	raw_bits = config & RISCV_PMU_RAW_EVENT_MASK;

	if (!(config >> 63)) {
		/* Hardware raw event: hand the masked value to the caller. */
		*econfig = raw_bits;
		return RISCV_PMU_RAW_EVENT_IDX;
	}

	/* Firmware event: the code must be below SBI_PMU_FW_MAX. */
	if (raw_bits >= SBI_PMU_FW_MAX)
		return -EINVAL;

	return (raw_bits & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16);
}
由於我們的 cache-misses event 是屬於 PERF_TYPE_HARDWARE,因此透過 pmu_hw_event_map[event->attr.config].event_idx 會去取得該 event 的 event_code,並且回傳 ret。接著回到 riscv_pmu_event_init,會將一些資訊放進 hwc 裡面。接著會進到 riscv_pmu_add
/*
 * Bind a perf event to a counter on the current CPU.
 *
 * The backend's ctr_get_idx() callback selects the counter. On success
 * the event is recorded in this CPU's event table, marked stopped and
 * up to date, and started right away when PERF_EF_START is set in @flags.
 *
 * Returns 0 on success, or the negative value from ctr_get_idx().
 */
static int riscv_pmu_add(struct perf_event *event, int flags)
{
	/* riscv_pmu.c */
	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
	struct cpu_hw_events *cpu_events = this_cpu_ptr(pmu->hw_events);
	struct hw_perf_event *hw = &event->hw;
	int ctr;

	ctr = pmu->ctr_get_idx(event);
	if (ctr < 0)
		return ctr;

	/* Remember which counter backs this event. */
	hw->idx = ctr;
	cpu_events->events[ctr] = event;
	cpu_events->n_events++;

	/* riscv_pmu_start() expects a stopped, up-to-date event. */
	hw->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (flags & PERF_EF_START)
		riscv_pmu_start(event, PERF_EF_RELOAD);

	/* Make the new state visible through the userspace mapping. */
	perf_event_update_userpage(event);

	return 0;
}
該 function 主要是找到目前我們是要使用哪個 counter index,至於如何找到 index則是透過 rvpmu->ctr_get_idx,我們將 rvpmu->ctr_get_idx 展開吧
static int pmu_sbi_ctr_get_idx(struct perf_event *event)
{
/* riscv_pmu_sbi.c */
struct hw_perf_event *hwc = &event->hw;
struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
struct sbiret ret;
int idx;
uint64_t cbase = 0;
uint64_t cmask = GENMASK_ULL(rvpmu->num_counters - 1, 0);
unsigned long cflags = 0;
if (event->attr.exclude_kernel)
cflags |= SBI_PMU_CFG_FLAG_SET_SINH;
if (event->attr.exclude_user)
cflags |= SBI_PMU_CFG_FLAG_SET_UINH;
/* retrieve the available counter index */
ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask,
cflags, hwc->event_base, hwc->config, 0);
if (ret.error) {
pr_debug("Not able to find a counter for event %lx config %llx\n",
hwc->event_base, hwc->config);
return sbi_err_map_linux_errno(ret.error);
}
idx = ret.value;
if (idx >= rvpmu->num_counters || !pmu_ctr_list[idx].value)
return -ENOENT;
/* Additional sanity check for the counter id */
if (pmu_sbi_ctr_is_fw(idx)) {
if (!test_and_set_bit(idx, cpuc->used_fw_ctrs))
return idx;
} else {
if (!test_and_set_bit(idx, cpuc->used_hw_ctrs))
return idx;
}
return -ENOENT;
}
我們會透過 sbi_ecall 當中的 SBI_EXT_PMU_COUNTER_CFG_MATCH 去找當初在 dts 設定這個 event 是使用哪個 counter index。簡單來說,就是要先在 dts 設定好每個 event 要使用哪個 counter index,具體如何加入到 dts 可以參考連結。取得 index 後回到 riscv_pmu_add,接著進入到 riscv_pmu_start
/*
 * Start counting for an event that is currently stopped.
 *
 * Warns once and returns without doing anything if the event is not in
 * the PERF_HES_STOPPED state. With PERF_EF_RELOAD in @flags the event
 * is also expected to be PERF_HES_UPTODATE (warning only). The counter
 * is then started through the backend's ctr_start() callback with
 * prev_count, masked to the counter width, as its initial value.
 */
static void riscv_pmu_start(struct perf_event *event, int flags)
{
	/* riscv_pmu.c */
	struct hw_perf_event *hw = &event->hw;
	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
	uint64_t width_mask = riscv_pmu_ctr_get_width_mask(event);
	u64 start_val;

	if (WARN_ON_ONCE(!(hw->state & PERF_HES_STOPPED)))
		return;

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(hw->state & PERF_HES_UPTODATE));

	/* Clear both STOPPED and UPTODATE: the event is live from here on. */
	hw->state = 0;

	riscv_pmu_event_set_period(event);

	/* Read prev_count only after the period has been set up. */
	start_val = local64_read(&hw->prev_count) & width_mask;
	pmu->ctr_start(event, start_val);

	perf_event_update_userpage(event);
}
準備開始計算 event counter:透過 rvpmu->ctr_start 設定 counter 的初始值,並且開始計算 event counter。結束之後準備收尾,也就是進入到 riscv_pmu_del
/*
 * Detach an event from its counter on the current CPU.
 *
 * Stops the event (folding in the final count), removes it from this
 * CPU's event table, asks the backend to reset and release the counter
 * when the matching callbacks are provided, and finally marks the
 * event as having no counter (idx = -1).
 */
static void riscv_pmu_del(struct perf_event *event, int flags)
{
	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
	struct cpu_hw_events *cpu_events = this_cpu_ptr(pmu->hw_events);
	struct hw_perf_event *hw = &event->hw;

	riscv_pmu_stop(event, PERF_EF_UPDATE);
	cpu_events->events[hw->idx] = NULL;

	/* The firmware needs to reset the counter mapping. */
	if (pmu->ctr_stop)
		pmu->ctr_stop(event, RISCV_PMU_STOP_FLAG_RESET);

	cpu_events->n_events--;

	/* Release the counter index while hw->idx is still valid. */
	if (pmu->ctr_clear_idx)
		pmu->ctr_clear_idx(event);

	perf_event_update_userpage(event);
	hw->idx = -1;
}
透過 riscv_pmu_del 準備將 event 移除,並且釋放 counter,其中會先進去 riscv_pmu_stop
/*
 * Stop counting for an event and fold the final counter value into it.
 *
 * Warns once and does nothing further if the event is already in the
 * PERF_HES_STOPPED state. Otherwise the counter is stopped through the
 * backend's ctr_stop() callback (when provided), the event count is
 * refreshed and the event is marked PERF_HES_UPTODATE.
 *
 * @flags is not consulted by this function.
 */
static void riscv_pmu_stop(struct perf_event *event, int flags)
{
	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
	struct hw_perf_event *hw = &event->hw;

	if (WARN_ON_ONCE(hw->state & PERF_HES_STOPPED))
		return;

	/* STOPPED is only recorded when the backend can stop the counter. */
	if (pmu->ctr_stop) {
		pmu->ctr_stop(event, 0);
		hw->state |= PERF_HES_STOPPED;
	}

	riscv_pmu_event_update(event);
	hw->state |= PERF_HES_UPTODATE;
}
最後進入到 riscv_pmu_stop,將 counter 停住,並且透過 riscv_pmu_event_update 去將 counter 取出來並且更新
/*
 * Fold the progress made by the counter since the last update into the
 * event.
 *
 * Reads the current raw counter value through the backend's ctr_read()
 * callback, publishes it as the new prev_count, and adds the difference
 * (masked to the counter width) to event->count while subtracting it
 * from period_left.
 *
 * Returns the delta that was accounted, or 0 when the backend provides
 * no ctr_read() callback.
 */
u64 riscv_pmu_event_update(struct perf_event *event)
{
	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	/*
	 * Keep the width mask in a 64-bit type, as the other users of
	 * riscv_pmu_ctr_get_width_mask() in this file do: with
	 * "unsigned long" the mask, and therefore the delta, would be cut
	 * down to 32 bits on 32-bit builds.
	 */
	u64 cmask;
	u64 oldval, delta;

	if (!rvpmu->ctr_read)
		return 0;

	cmask = riscv_pmu_ctr_get_width_mask(event);

	/*
	 * Retry until the compare-and-exchange sees the same prev_count
	 * that was sampled at the top of the iteration, i.e. until nobody
	 * replaced it between the read and the exchange.
	 */
	do {
		prev_raw_count = local64_read(&hwc->prev_count);
		new_raw_count = rvpmu->ctr_read(event);
		oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					 new_raw_count);
	} while (oldval != prev_raw_count);

	/* Masking keeps the difference correct across a counter wrap. */
	delta = (new_raw_count - prev_raw_count) & cmask;
	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return delta;
}
其中可以發現透過 rvpmu->ctr_read 會取得 counter 目前的值,再與先前記錄的 prev_count 相減(並以 counter 寬度的 mask 遮罩)得出 delta,累加到 event 的 count 之後回傳
我們將 rvpmu->ctr_read 展開一下,究竟如何讀取的
/*
 * Read the current raw value of the counter backing @event.
 *
 * Firmware events are read through the SBI_EXT_PMU_COUNTER_FW_READ
 * call; if that call reports an error the function returns 0. All
 * other events are read directly from the CSR recorded in pmu_ctr_list
 * for the counter index.
 *
 * Returns the counter value as a 64-bit quantity.
 */
static u64 pmu_sbi_ctr_read(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;
	struct sbiret ret;
	union sbi_pmu_ctr_info info;
	u64 val = 0;

	if (pmu_sbi_is_fw_event(event)) {
		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
				idx, 0, 0, 0, 0, 0);
		if (!ret.error)
			val = ret.value;
	} else {
		info = pmu_ctr_list[idx];
		val = riscv_pmu_ctr_read_csr(info.csr);
		/*
		 * On 32-bit builds a CSR read yields only the low half; the
		 * CSR at +0x80 supplies the upper half (the "h" counterpart
		 * in the RISC-V CSR map). It has to land in bits 63:32, so
		 * the shift is 32 -- a shift of 31 would overlap bit 31 of
		 * the low half and corrupt the combined value.
		 */
		if (IS_ENABLED(CONFIG_32BIT))
			val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 32 | val;
	}

	return val;
}
可以發現當是 fw_event 時,會透過 sbi_ecall 使用 SBI_EXT_PMU_COUNTER_FW_READ ,而其他 event 則是直接對其 hpmcounter
進行讀取
他的一生大概就醬了~
今日介紹完risc-v統一的作法,比起 andes 的方法,使用的csr 數量較少,且考慮較周全,許多實作搬移到 opensbi 進行處理,架構看起來較完整,但筆者認為 andes 的作法也不差,想想在統一作法之前,皆是使用這種作法,便讓人感覺到很佩服呢~
好了,明天來說說這幾天的心得吧,大家明天見。